import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import plotly.io as pio
pio.renderers.default='notebook'
# load the dataset into a pandas dataframe
df = pd.read_csv("../data/lending_club_modified_data.csv", index_col=False, dtype='unicode')
# have a quick glance at the dataframe
df.head()
| id | term | grade | emp_title | emp_length_years | purpose | loan_status | loan_type | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1077501 | 36 months | B | employer not mentioned | 10 | credit_card | Fully Paid | Good Loans |
| 1 | 1077430 | 60 months | C | ryder | 0 | car | Charged Off | Bad Loans |
| 2 | 1077175 | 36 months | C | employer not mentioned | 10 | small_business | Fully Paid | Good Loans |
| 3 | 1076863 | 36 months | C | air resources board | 10 | other | Fully Paid | Good Loans |
| 4 | 1075358 | 60 months | B | university medical group | 1 | other | Current | Good Loans |
# function to create dataframe for pie chart
def build_df(var_name):
# group the data based on var_name and store it as a dictionary
data = dict(df[var_name].value_counts())
# convert dictionary to pandas dataframe
data = pd.DataFrame(data.items(),columns = [var_name,'count'])
return data
# function to plot pie chart
def plot_pretty_pie_chart(label, value, title_name):
# create a plotly pie chart
fig = go.Figure(data=[go.Pie(labels=label, values=value)])
# write label and percentage inside/outside the chart
fig.update_traces(textinfo='label+percent')
# pull each pie slices a bit to make it look nicer
fig.update_traces(pull=[0.1, 0.1, 0.1, 0.1])
# assign bright color combinations
fig.update_traces(marker=dict(colors=['#286086', 'maroon', 'blue']))
# adjust the size of the figure
fig.update_layout(width=600, height=550)
# give title to the chart and align it to the center
fig.update_layout(title=title_name,title_x=0.5)
# play around with the font and its size of the body
fig.update_layout(font_family="Arial", font_size = 15)
# play around with the font and its size of the title
fig.update_layout(title = {'font_color':'navy', 'font_size':25})
# display the chart
fig.show()
df = df.copy(deep=True).astype({'emp_length_years': 'int64'})
Distribution of loan status
loan_status_df = build_df('loan_status')
loan_status_df.style.hide_index()
| loan_status | count |
|---|---|
| Fully Paid | 32950 |
| Charged Off | 5627 |
| Current | 1140 |
Pie chart showing distribution of loan status
loan_status_type = loan_status_df['loan_status']
loan_status_count = loan_status_df['count']
plot_pretty_pie_chart(loan_status_type, loan_status_count, 'Loan Status')
loan_type_df = build_df('loan_type')
loan_type_df.style.hide_index()
| loan_type | count |
|---|---|
| Good Loans | 34090 |
| Bad Loans | 5627 |
loan_type = loan_type_df['loan_type']
loan_type_count = loan_type_df['count']
plot_pretty_pie_chart(loan_type, loan_type_count, 'Good Loans and Bad Loans')
def generate_df_by_segment_column(segment_col):
loan_segment_by_col_df = pd.DataFrame(df.groupby([segment_col,'loan_type'])['loan_type'].count())
loan_segment_by_col_df['percent'] = loan_segment_by_col_df.groupby(level=0).apply(lambda x: round(100 * x / float(x.sum())))
loan_segment_by_col_df = loan_segment_by_col_df.rename(columns={'loan_type':'count'})
loan_segment_by_col_df['percent'] = loan_segment_by_col_df['percent'].apply(np.int64)
return loan_segment_by_col_df
def plot_grouped_bar_chart(data, x_value, y_value, legend, title_name):
df = data.reset_index()
fig = px.bar(data_frame=df, x=x_value, y=y_value, color=legend, labels={
x_value: x_value.capitalize(),
y_value: y_value.capitalize()
}, barmode="group", hover_data = {'percent':True,'count':True},color_discrete_sequence=['maroon','#286086'])
fig.update_layout(width=600, height=550)
fig.update_layout()
fig.update_layout(title=title_name,title_x=0.5)
fig.update_layout(legend_title_text='Loan type')
fig.update_layout(font_family="Arial", font_size = 15)
fig.update_layout(title = {'font_color':'navy', 'font_size':25})
fig.show()
Generate the segmented df
loan_segment_by_term_df = generate_df_by_segment_column('term')
loan_segment_by_term_df
| count | percent | ||
|---|---|---|---|
| term | loan_type | ||
| 36 months | Bad Loans | 3227 | 11 |
| Good Loans | 25869 | 89 | |
| 60 months | Bad Loans | 2400 | 23 |
| Good Loans | 8221 | 77 |
Plot grouped bar chart based on total count
plot_grouped_bar_chart(loan_segment_by_term_df, 'term', 'count', 'loan_type','Loan Segment by Term (Total)')
Plot grouped bar chart based on proportion
plot_grouped_bar_chart(loan_segment_by_term_df, 'term', 'percent', 'loan_type','Loan Segment by Term (%)')
Generate the segmented df
loan_segment_by_grade_df = generate_df_by_segment_column('grade')
loan_segment_by_grade_df
| count | percent | ||
|---|---|---|---|
| grade | loan_type | ||
| A | Bad Loans | 602 | 6 |
| Good Loans | 9483 | 94 | |
| B | Bad Loans | 1425 | 12 |
| Good Loans | 10595 | 88 | |
| C | Bad Loans | 1347 | 17 |
| Good Loans | 6751 | 83 | |
| D | Bad Loans | 1118 | 21 |
| Good Loans | 4189 | 79 | |
| E | Bad Loans | 715 | 25 |
| Good Loans | 2127 | 75 | |
| F | Bad Loans | 319 | 30 |
| Good Loans | 730 | 70 | |
| G | Bad Loans | 101 | 32 |
| Good Loans | 215 | 68 |
Generate grouped bar chart (total)
plot_grouped_bar_chart(loan_segment_by_grade_df, 'grade', 'count', 'loan_type', 'Loan Segment by Grade (Total)')
Generate grouped bar chart (%)
plot_grouped_bar_chart(loan_segment_by_grade_df, 'grade', 'percent', 'loan_type', 'Loan Segment by Grade (%)')
# helper functions to get good loan df
def get_segmented_good_loan_df(var_name):
data = generate_df_by_segment_column(var_name).reset_index()
data = data[data['loan_type']=='Good Loans']
return data.reset_index(drop=True)
# helper functions to get bad loan df
def get_segmented_bad_loan_df(var_name):
data = generate_df_by_segment_column(var_name).reset_index()
data = data[data['loan_type']=='Bad Loans']
return data
# helper function to plot line chart
def plot_line_chart(data, x_value, y_value, title_name,color):
df = data.reset_index()
fig = px.line(data_frame=df, x=x_value, y=y_value, labels={
x_value: 'Employment Length (Years)',
y_value: 'Percent'
}, hover_data = {'percent':True,'count':True},color_discrete_sequence=[color], orientation='h')
fig.update_layout(width=600, height=550)
fig.update_layout()
fig.update_layout(title=title_name,title_x=0.5)
#fig.update_layout(legend_title_text='Loan type')
fig.update_layout(font_family="Times New Roman", font_size = 15)
fig.update_layout(title = {'font_color':'navy', 'font_size':25})
fig.show()
print('------------------------\n\
-1 denotes missing values\n \
0 denotes less than 1 year\n \
10 denotes 10 or more years\n \
------------------------')
emp_length_good_loan_df = get_segmented_good_loan_df('emp_length_years')
#emp_length_good_loan_df.replace({'emp_length_years': {-1: 'Not mentioned', 0:'< 1', 10: '10+'}}, inplace=True)
emp_length_good_loan_df.style.hide_index()
------------------------ -1 denotes missing values 0 denotes less than 1 year 10 denotes 10 or more years ------------------------
| emp_length_years | loan_type | count | percent |
|---|---|---|---|
| -1 | Good Loans | 847 | 79 |
| 0 | Good Loans | 3944 | 86 |
| 1 | Good Loans | 2784 | 86 |
| 2 | Good Loans | 3821 | 87 |
| 3 | Good Loans | 3540 | 86 |
| 4 | Good Loans | 2974 | 87 |
| 5 | Good Loans | 2824 | 86 |
| 6 | Good Loans | 1922 | 86 |
| 7 | Good Loans | 1510 | 85 |
| 8 | Good Loans | 1276 | 86 |
| 9 | Good Loans | 1100 | 87 |
| 10 | Good Loans | 7548 | 85 |
print('------------------------\n\
-1 denotes missing values\n \
0 denotes less than 1 year\n \
10 denotes 10 or more years\n \
------------------------')
plot_line_chart(emp_length_good_loan_df, 'emp_length_years', 'percent','Good Loan and Employment Duration','#286086')
------------------------ -1 denotes missing values 0 denotes less than 1 year 10 denotes 10 or more years ------------------------
print('------------------------\n\
-1 denotes missing values\n \
0 denotes less than 1 year\n \
10 denotes 10 or more years\n \
------------------------')
emp_length_bad_loan_df = get_segmented_bad_loan_df('emp_length_years')
#emp_length_bad_loan_df.replace({'emp_length_years': {-1: 'Not mentioned', 0:'< 1', 10: '10+'}}, inplace=True)
emp_length_bad_loan_df.style.hide_index()
------------------------ -1 denotes missing values 0 denotes less than 1 year 10 denotes 10 or more years ------------------------
| emp_length_years | loan_type | count | percent |
|---|---|---|---|
| -1 | Bad Loans | 228 | 21 |
| 0 | Bad Loans | 639 | 14 |
| 1 | Bad Loans | 456 | 14 |
| 2 | Bad Loans | 567 | 13 |
| 3 | Bad Loans | 555 | 14 |
| 4 | Bad Loans | 462 | 13 |
| 5 | Bad Loans | 458 | 14 |
| 6 | Bad Loans | 307 | 14 |
| 7 | Bad Loans | 263 | 15 |
| 8 | Bad Loans | 203 | 14 |
| 9 | Bad Loans | 158 | 13 |
| 10 | Bad Loans | 1331 | 15 |
print('------------------------\n\
-1 denotes missing values\n \
0 denotes less than 1 year\n \
10 denotes 10 or more years\n \
------------------------')
plot_line_chart(emp_length_bad_loan_df, 'emp_length_years', 'percent','Bad Loan and Employment Duration','maroon')
------------------------ -1 denotes missing values 0 denotes less than 1 year 10 denotes 10 or more years ------------------------
get_segmented_good_loan_df = dict(df['emp_title'].value_counts())
# convert dictionary to pandas dataframe
data = pd.DataFrame(get_segmented_good_loan_df.items(),columns = ['emp_title','count'])
print('The top employers for LC borrowers:')
emp_title_df = build_df('emp_title').head(15)
emp_title_df.style.hide_index()
The top employers for LC borrowers:
| emp_title | count |
|---|---|
| employer not mentioned | 2459 |
| us army | 212 |
| bank of america | 138 |
| at&t | 83 |
| walmart | 82 |
| wells fargo | 71 |
| ibm | 69 |
| kaiser permanente | 69 |
| verizon wireless | 65 |
| ups | 63 |
| self | 60 |
| usaf | 58 |
| usps | 58 |
| us air force | 57 |
| self employed | 57 |
Generate df
bad_loan_purpose_df = get_segmented_bad_loan_df('purpose').reset_index(drop=True).sort_values(by=['percent'], ascending=False)
bad_loan_purpose_df.style.hide_index()
| purpose | loan_type | count | percent |
|---|---|---|---|
| small_business | Bad Loans | 475 | 26 |
| renewable_energy | Bad Loans | 19 | 18 |
| educational | Bad Loans | 56 | 17 |
| moving | Bad Loans | 92 | 16 |
| other | Bad Loans | 633 | 16 |
| debt_consolidation | Bad Loans | 2767 | 15 |
| house | Bad Loans | 59 | 15 |
| medical | Bad Loans | 106 | 15 |
| vacation | Bad Loans | 53 | 14 |
| home_improvement | Bad Loans | 347 | 12 |
| credit_card | Bad Loans | 542 | 11 |
| car | Bad Loans | 160 | 10 |
| major_purchase | Bad Loans | 222 | 10 |
| wedding | Bad Loans | 96 | 10 |
Generate horizontal bar chart
fig = px.bar(bad_loan_purpose_df, x="percent", y="purpose", labels={
'percent': 'Percent',
'purpose': 'Purpose'}, orientation='h', hover_data = {'count':True})
fig.update_layout(yaxis={'categoryorder':'total ascending'})
fig.update_layout(width=700, height=550)
fig.update_traces(marker_color='maroon')
fig.update_layout(font_family="Arial", font_size = 15)
fig.update_layout(title='Purpose for Bad Loans',title_x=0.5)
fig.update_layout(title = {'font_color':'Navy', 'font_size':25})
fig.show()